This notebook works via Google Colab.
Let's connect directories and load useful libs.
# --- Colab environment setup -------------------------------------------
from google.colab import drive
# force_remount=True: re-mount cleanly even if the drive is already mounted.
drive.mount('/content/gdrive',force_remount=True)
# Notebook magic: change the working directory to the project folder on Drive.
cd /content/gdrive/'My Drive'/Colab_Notebooks/project_intern
# pyLDAvis is not preinstalled in Colab, so install it in the session.
!pip install pyLDAvis
import os
import numpy as np
import pandas as pd
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
# vectorizing and so on
from sklearn.feature_extraction.text import CountVectorizer
import warnings
from tqdm import tqdm_notebook
# Directory (relative to the project folder) holding the preprocessed texts.
PATH_TO_DATA = 'texts_after_preproc'
Loading the preprocessed small data
# Load the preprocessed corpus; the `texts` column holds one document per row.
data = pd.read_csv(os.path.join(PATH_TO_DATA,'texts_part.csv'),index_col=0)
data
# Plain Python list of document strings for the vectorizer.
data_list = data.texts.values.tolist()
data_list[:4]
Vectorization
# Unigrams + bigrams; drop terms that occur in fewer than 80 documents
# (too rare) or in more than 20% of documents (near-stopwords).
vectorizer = CountVectorizer(ngram_range=(1, 2),min_df=80,max_df =0.2)
matrix = vectorizer.fit_transform(data_list)
vectorizer.vocabulary_
Preparation
def vect2gensim(vectorizer, dtmatrix):
    """Transform a fitted sklearn vectorizer and its sparse document-term
    matrix into a gensim corpus plus a gensim Dictionary.

    Parameters
    ----------
    vectorizer : fitted sklearn vectorizer
        Must expose ``vocabulary_`` (word -> column index).
    dtmatrix : scipy sparse matrix
        Document-term matrix with documents in rows.

    Returns
    -------
    tuple (corpus, dictionary)
        ``corpus`` is a gensim ``Sparse2Corpus`` view of ``dtmatrix``;
        ``dictionary`` is a gensim ``Dictionary`` built from that corpus.
    """
    # documents_columns=False: the documents are the ROWS of dtmatrix.
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix, documents_columns=False)
    # vocabulary_ maps word -> id; gensim needs the inverse id -> word map.
    # (`idx` instead of `id` avoids shadowing the builtin.)
    id2word = {idx: word for word, idx in vectorizer.vocabulary_.items()}
    dictionary = gensim.corpora.dictionary.Dictionary.from_corpus(
        corpus_vect_gensim, id2word=id2word)
    return (corpus_vect_gensim, dictionary)

(gensim_corpus, gensim_dict) = vect2gensim(vectorizer, matrix)
def corpus2words_lists(gensim_corpus, gensim_dict):
    """Convert a bag-of-words corpus back into per-document token lists.

    For every document, rebuilds a flat list of words in which each word
    appears as many times as its count — the ``texts`` input expected by
    gensim's ``CoherenceModel`` with ``coherence='c_v'``.

    Parameters
    ----------
    gensim_corpus : iterable of list of (int, int)
        Bag-of-words corpus: per document, (word_id, count) pairs.
    gensim_dict : mapping (or gensim Dictionary)
        Maps word ids to word strings.

    Returns
    -------
    list of list of str
        One token list per document, in corpus order.
    """
    id2word = dict(gensim_dict)
    texts_new = []
    # Iterate the corpus lazily — no need to materialize it as a list first.
    for doc in gensim_corpus:
        tokens = []
        for word_id, count in doc:
            # Repeat the word `count` times; extend with repetition replaces
            # the original O(count) append loop.  int() guards against
            # numpy/float counts coming from a sparse matrix.
            tokens.extend([id2word[word_id]] * int(count))
        texts_new.append(tokens)
    return texts_new
# Token lists per document — required by CoherenceModel(coherence='c_v').
data_words = corpus2words_lists(gensim_corpus,gensim_dict)
Firstly, I want to check the LSA model
%%time
# Build LSA model
# Baseline LSA (truncated SVD) with 15 topics.
lsamodel = gensim.models.LsiModel(gensim_corpus, num_topics=15, id2word = gensim_dict,power_iters=5)
# Compute Coherence Score
# c_v coherence needs the tokenized texts, not just the BoW corpus.
coherence_model_lsa = CoherenceModel(model=lsamodel, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)
This score is not good, let's optimize the number of topics.
%%time
warnings.filterwarnings("ignore")
# Sweep the number of topics and record c_v coherence for each setting.
# power_iters is lowered to 2 here to keep the sweep fast; the chosen
# model is refit later with more power iterations.
numbers = [10,15,18,20,25]
coherences = []
for num_topics in tqdm_notebook(numbers):
    lsamodel = gensim.models.LsiModel(gensim_corpus,
                                      num_topics=num_topics,
                                      id2word = gensim_dict,
                                      power_iters=2)
    coherence_model_lsa = CoherenceModel(model=lsamodel, texts=data_words, dictionary=gensim_dict, coherence='c_v')
    coherences.append(coherence_model_lsa.get_coherence())
# Plot coherence vs. number of topics.
plt.plot(numbers,coherences)
plt.title('LSA model optimization')
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.grid(True)
%%time
# Refit with many more power iterations (30) for a more accurate SVD.
# NOTE(review): num_topics=5 was not in the sweep above — presumably chosen
# after inspecting the plot; confirm against the notebook output.
best_lsamodel = gensim.models.LsiModel(gensim_corpus, num_topics=5, id2word = gensim_dict,power_iters=30)
That's better, let's see the coherence score and the most important words
# Compute Coherence Score
coherence_model_lsa = CoherenceModel(model=best_lsamodel, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)
# Top words per topic for a qualitative check.
pprint(best_lsamodel.print_topics())
We have comparatively high coherence, but the topics are not interpretable. Let's try more iterations and topics.
# Retry LSA with 15 topics at the same high iteration count.
best_lsamodel = gensim.models.LsiModel(gensim_corpus, num_topics=15, id2word = gensim_dict,power_iters=30)
pprint(best_lsamodel.print_topics())
I think, that LSA model is not a good fit for our task. Let's try LDA model.
%%time
# Build LDA model
warnings.filterwarnings("ignore")
# Baseline LDA: 15 topics, priors left at their defaults, a single pass.
# random_state pins the result for reproducibility; workers=3 parallelizes
# training across processes.
lda_model = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                       id2word=gensim_dict,
                                       num_topics=15,
                                       random_state=100,
                                       workers = 3,
                                       passes=1
                                       )
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
pprint(lda_model.print_topics())
We have much better quality. Let's optimize.
%%time
warnings.filterwarnings("ignore")
# Same sweep as for LSA: try several topic counts and track c_v coherence.
numbers = [10,15,18,20,25]
coherences = []
for num_topics in tqdm_notebook(numbers):
    lda_model = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                           id2word=gensim_dict,
                                           num_topics=num_topics,
                                           random_state=100,
                                           workers = 3,
                                           passes=1
                                           )
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=gensim_dict, coherence='c_v')
    coherences.append(coherence_model_lda.get_coherence())
# Plot coherence vs. number of topics.
plt.plot(numbers,coherences)
plt.title('LDA model optimization')
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.grid(True)
We know that 10 topics is too few, so I will choose 20 topics and look at the results.
%%time
# Candidate final model: 20 topics, more passes (5) for better convergence.
best_lda_model = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                            id2word=gensim_dict,
                                            num_topics=20,
                                            random_state=100,
                                            workers = 3,
                                            passes=5
                                            )
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=best_lda_model, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
pprint(best_lda_model.print_topics())
We have good topics. Let's see some pictures.
def plot_difference(mdiff, title="", annotation=None):
    """Helper function to plot difference between models.
    Uses matplotlib as the backend.

    mdiff: square matrix of pairwise topic distances (from LdaModel.diff).
    title: plot title.
    annotation: accepted for API symmetry with the gensim diff output.
    """
    # NOTE(review): `annotation` is accepted but never used in this body —
    # callers still pass it.  Consider using it to annotate cells or
    # dropping the parameter; left untouched to preserve behavior.
    fig, ax = plt.subplots(figsize=(18, 14))
    data = ax.imshow(mdiff, cmap='RdBu_r', origin='lower')
    plt.title(title)
    plt.colorbar(data)
# Self-difference of the model's topics (jaccard distance on top-50 words);
# off-diagonal values show how distinct the topics are from one another.
mdiff, annotation = best_lda_model.diff(best_lda_model, distance='jaccard', num_words=50)
plot_difference(mdiff, title="Topic difference (one model) [jaccard distance]", annotation=annotation)
Our topics are largely independent, but let's further cut the number of topics to 15. Let's look at the topics.
# Visualize the topics
# NOTE(review): `pyLDAvis.gensim` is the legacy module name; newer pyLDAvis
# releases renamed it to `pyLDAvis.gensim_models` — confirm the installed
# version before re-running.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(best_lda_model, gensim_corpus, gensim_dict)
vis
Checking the parameters of the prior Dirichlet distributions.
# Inspect the fitted priors: alpha (document-topic), eta (topic-word).
best_lda_model.alpha
best_lda_model.eta
Good. Now, I want to tune parameters of the distributions to find the best sparsity.
%%time
# Reference model for the prior-tuning experiments: 15 topics, priors
# left at their defaults (inspected just below).
best_lda_model1 = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                             id2word=gensim_dict,
                                             num_topics=15,
                                             random_state=100,
                                             workers = 3,
                                             passes=1
                                             )
best_lda_model1.alpha
best_lda_model1.eta
# Spot-check: raw text of document 0 vs. its inferred topic mixture.
data_list[0]
best_lda_model1.get_document_topics(gensim_corpus[0])
# Compute Coherence Score
coherence_model_lda1 = CoherenceModel(model=best_lda_model1, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()
print('Coherence Score: ', coherence_lda1)
pprint(best_lda_model1.print_topics())
# Topic self-difference heatmap.
mdiff, annotation = best_lda_model1.diff(best_lda_model1, distance='jaccard', num_words=50)
plot_difference(mdiff, title="Topic difference (one model) [jaccard distance]", annotation=annotation)
# Visualize the topics
pyLDAvis.enable_notebook()
vis1 = pyLDAvis.gensim.prepare(best_lda_model1, gensim_corpus, gensim_dict)
vis1
%%time
# Experiment 2: moderate symmetric priors (alpha = eta = 0.2).
best_lda_model2 = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                             id2word=gensim_dict,
                                             num_topics=15,
                                             random_state=100,
                                             workers = 3,
                                             passes=1,
                                             eta = 0.2,
                                             alpha = 0.2
                                             )
# Same spot-check document as for model 1.
data_list[0]
best_lda_model2.get_document_topics(gensim_corpus[0])
# Compute Coherence Score
coherence_model_lda2 = CoherenceModel(model=best_lda_model2, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda2 = coherence_model_lda2.get_coherence()
print('Coherence Score: ', coherence_lda2)
pprint(best_lda_model2.print_topics())
mdiff, annotation = best_lda_model2.diff(best_lda_model2, distance='jaccard', num_words=50)
plot_difference(mdiff, title="Topic difference (one model) [jaccard distance]", annotation=annotation)
# Visualize the topics
pyLDAvis.enable_notebook()
vis2 = pyLDAvis.gensim.prepare(best_lda_model2, gensim_corpus, gensim_dict)
vis2
%%time
# Experiment 3: large priors (alpha = eta = 10) — pushes the document-topic
# and topic-word distributions toward uniform (low sparsity).
best_lda_model3 = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                             id2word=gensim_dict,
                                             num_topics=15,
                                             random_state=100,
                                             workers = 3,
                                             passes=1,
                                             eta = 10,
                                             alpha = 10
                                             )
# Same spot-check document as before.
data_list[0]
best_lda_model3.get_document_topics(gensim_corpus[0])
# Compute Coherence Score
coherence_model_lda3 = CoherenceModel(model=best_lda_model3, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda3 = coherence_model_lda3.get_coherence()
print('Coherence Score: ', coherence_lda3)
pprint(best_lda_model3.print_topics())
mdiff, annotation = best_lda_model3.diff(best_lda_model3, distance='jaccard', num_words=50)
plot_difference(mdiff, title="Topic difference (one model) [jaccard distance]", annotation=annotation)
These parameters cause too little sparsity, which hurts the quality of the topics; the differences between topics are insignificant.
# Visualize the topics
pyLDAvis.enable_notebook()
vis3 = pyLDAvis.gensim.prepare(best_lda_model3, gensim_corpus, gensim_dict)
vis3
Let's increase sparsity to create independent, interpretable topics.
%%time
# Experiment 4: small priors (alpha = eta = 0.01) — encourages sparse
# document-topic and topic-word distributions.
best_lda_model4 = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                             id2word=gensim_dict,
                                             num_topics=15,
                                             random_state=100,
                                             workers = 3,
                                             passes=1,
                                             eta = 0.01,
                                             alpha = 0.01
                                             )
# Same spot-check document as before.
data_list[0]
best_lda_model4.get_document_topics(gensim_corpus[0])
# Compute Coherence Score
coherence_model_lda4 = CoherenceModel(model=best_lda_model4, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda4 = coherence_model_lda4.get_coherence()
print('Coherence Score: ', coherence_lda4)
pprint(best_lda_model4.print_topics())
mdiff, annotation = best_lda_model4.diff(best_lda_model4, distance='jaccard', num_words=50)
plot_difference(mdiff, title="Topic difference (one model) [jaccard distance]", annotation=annotation)
# Visualize the topics
pyLDAvis.enable_notebook()
vis4 = pyLDAvis.gensim.prepare(best_lda_model4, gensim_corpus, gensim_dict)
vis4
%%time
# Experiment 5: even sparser topic-word prior (eta = 0.005, alpha = 0.01).
best_lda_model5 = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                             id2word=gensim_dict,
                                             num_topics=15,
                                             random_state=100,
                                             workers = 3,
                                             passes=1,
                                             eta = 0.005,
                                             alpha = 0.01
                                             )
# Same spot-check document as before.
data_list[0]
best_lda_model5.get_document_topics(gensim_corpus[0])
# Compute Coherence Score
coherence_model_lda5 = CoherenceModel(model=best_lda_model5, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda5 = coherence_model_lda5.get_coherence()
print('Coherence Score: ', coherence_lda5)
pprint(best_lda_model5.print_topics())
mdiff, annotation = best_lda_model5.diff(best_lda_model5, distance='jaccard', num_words=50)
plot_difference(mdiff, title="Topic difference (one model) [jaccard distance]", annotation=annotation)
# Visualize the topics
pyLDAvis.enable_notebook()
vis5 = pyLDAvis.gensim.prepare(best_lda_model5, gensim_corpus, gensim_dict)
vis5
So, we have some good sets of parameters, let's see the difference between the topics for 5 and 4 models. It is important to compare them.
# Topic difference BETWEEN two models (model 4 vs. model 5), jaccard on
# the top-50 words per topic.
mdiff, annotation = best_lda_model4.diff(best_lda_model5, distance='jaccard', num_words=50)
plot_difference(mdiff, title="Topic difference (two model) [jaccard distance]", annotation=annotation)
So, let's see the difference in practice.
pprint(best_lda_model4.print_topics())
# Qualitative comparison: the same documents scored by models 1, 4 and 5.
# NOTE(review): indices up to 100000 assume the corpus holds >100k
# documents — confirm against the loaded data size.
data_list[800]
best_lda_model1.get_document_topics(gensim_corpus[800])
best_lda_model4.get_document_topics(gensim_corpus[800])
best_lda_model5.get_document_topics(gensim_corpus[800])
data_list[80000]
best_lda_model1.get_document_topics(gensim_corpus[80000])
best_lda_model4.get_document_topics(gensim_corpus[80000])
best_lda_model5.get_document_topics(gensim_corpus[80000])
data_list[10000]
best_lda_model1.get_document_topics(gensim_corpus[10000])
best_lda_model4.get_document_topics(gensim_corpus[10000])
best_lda_model5.get_document_topics(gensim_corpus[10000])
data_list[100000]
best_lda_model1.get_document_topics(gensim_corpus[100000])
best_lda_model4.get_document_topics(gensim_corpus[100000])
best_lda_model5.get_document_topics(gensim_corpus[100000])
From my point of view, the 4th model is the best. Let's increase the number of passes.
%%time
# Final model: 15 topics, sparse priors (alpha = eta = 0.01, the model-4
# setting), trained with 3 passes for better convergence.
best_lda_model_final = gensim.models.LdaMulticore(corpus=gensim_corpus,
                                                  id2word=gensim_dict,
                                                  num_topics=15,
                                                  random_state=100,
                                                  workers = 3,
                                                  passes=3,
                                                  eta = 0.01,
                                                  alpha = 0.01
                                                  )
pprint(best_lda_model_final.print_topics())
# Same spot-check documents as in the comparison above.
best_lda_model_final.get_document_topics(gensim_corpus[800])
best_lda_model_final.get_document_topics(gensim_corpus[80000])
best_lda_model_final.get_document_topics(gensim_corpus[10000])
best_lda_model_final.get_document_topics(gensim_corpus[100000])
# Compute Coherence Score
coherence_model_lda_final = CoherenceModel(model=best_lda_model_final, texts=data_words, dictionary=gensim_dict, coherence='c_v')
coherence_lda_final = coherence_model_lda_final.get_coherence()
print('Coherence Score: ', coherence_lda_final)
# Visualize the topics
pyLDAvis.enable_notebook()
vis_final = pyLDAvis.gensim.prepare(best_lda_model_final, gensim_corpus, gensim_dict)
vis_final